## 1. Folder structure
## Figures and tables present in the dataset could be replicated by running the main republication file "manuscript_replication.R"
## Anonymized data source lives under the folder "source_data." Generated figures and tables are exported in the root folder


## 2. Setup and load necessary packages

# Required packages

required_packages <- c(
  "tidyverse",
  "ggforce",
  "extrafont",
  "readxl",
  "knitr",
  "MASS",
  "ordinal",
  "gt",
  "sf",
  "ggspatial",
  "classInt",
  "cowplot",
  "ggrepel",
  "ggpattern",
  "pscl",
  "marginaleffects",
  "texreg"
)

# Function to install and load packages

install_and_load <- function(package) {
  if (!require(package, character.only = TRUE)) {
    install.packages(package, dependencies = TRUE)
    library(package, character.only = TRUE)
  }
}

# Apply the function to all required packages
sapply(required_packages, install_and_load)

# Load fonts 
if ("extrafont" %in% installed.packages()) {
  library(extrafont)
  # font_import()  # Only run once or comment out after initial setup. Commented to save time
  loadfonts(device = "win")  # Adjust as needed. Default is Windows, although as the publisher asks for eps, we change it here
}

## 3. Load data

read_rds("ruaf_analysis_full_anon.rds") |> 
  mutate(
    category = factor(category),
    quintile_group = factor(quintile_group),
    others = 1-BelRusUkr_agr, 
    agr_ethnos_f = factor(agr_ethnos),
    agr_ethnos_f = relevel(agr_ethnos_f, ref = "BelRusUkr")
  ) |> 
  rename(
    "Prob_non_es_memo" = "ethn_non_e_slavs",
    "Prob_non_es_bess" = "others",
  )-> ruaf_analysis_full


## 4. Generate figures and tables used in the main body of the manuscript

######################
#### Make Figure 1
######################

# Add Mediazona data, join with ID. Make sure that the source data is clean and nice.

# Load Mediazona data

bbc_mediazona <- read_csv("Russian losses in Ukraine.csv") |> 
  # Add ID column to join with the main dataset
  left_join(
    read_csv("bbc_mediazona_geography.csv"), by = c("region" = "RU_NAME")
  ) |> 
# Exclude those without identified region or foreigners
  filter(
    !region %in% c("###", "Иностранцы")
  ) |> 
  mutate(
    proportion = value/sum(value)*100
  )


### Load necessary shapefiles

### Russia outline source: GADM v2.5. Global Administrative Areas. Available at https://gadm.org/download_country_v3.html
### Raw shapefiles for Russia and Ukraine are manually processed using QGIS.
### Manipulations: merging Crimea and Sevastopol outlines to the Russia outline; merging administrative units into federal districts.

russia_outline <- st_read("RU_polar_epsg_3576.shp") |> 
  left_join(bbc_mediazona, by = "ID_1")

russia_fed_district_outline <- st_read("RU_fed_distr_epsg_3576.shp") |> 
  mutate(
    centroid = st_centroid(geometry),
    x = st_coordinates(centroid)[, 1],
    y = st_coordinates(centroid)[, 2]
  )

### Military infrstructure source: Status 6, v1.5. Available at https://x.com/Archer83Able/status/1773732406991708277
### Raw KMZ file was then converted to a shapefile and processed using QGIS.
### Manipulations: merging all military infrastructure point layers into one layer, cropped out of Russia outline.

mil_infrastructure <- st_read("RU_mun_centroids_w_counts.shp") |>
  filter(NUMPOINTS > 0)

breaks <- classIntervals(russia_outline$proportion, n = 5, style = "jenks")
breaks_points <- classIntervals(mil_infrastructure$NUMPOINTS, n = 5, style = "jenks")

breaks$brks <- c(0.05, 0.5, 1, 2, 3, 5)
breaks_points$brks <- c(1, 2, 5, 10, 20, 70)

russia_outline$share_category <- cut(
  russia_outline$proportion,
  breaks = breaks$brks,
  labels = c("<= 0.5%", "(0.5% - 1%]", "(1% - 2]%", "(2% - 3]%", "> 3%"),
  include.lowest = TRUE
)

mil_infrastructure$size_category <- cut(
  mil_infrastructure$NUMPOINTS,
  breaks = breaks_points$brks,
  labels = c("1", "2-5", "6-10", "11-20", ">20"),
  include.lowest = TRUE
)

legend_data <- data.frame(
  size_category = factor(c(0.2, 0.5, 1, 2, 3), levels = c(0.2, 0.5, 1, 2, 3), labels = c("1", "2-5", "6-10", "11-20", ">20")),
  x = 1:5,
  y = 1:5
)

### Build main and inset maps

ggplot(data = russia_outline) +
  geom_sf(aes(fill = share_category), color = "#383535", linewidth = 0.2) +
  geom_sf(data = russia_fed_district_outline, fill = NA,
          color = "black", linewidth = 0.2) +
  geom_sf(
    data = mil_infrastructure |>
      filter(!NAME_1 %in% c("Moscow City", "Moskva")),
    shape = 21, fill = "#7d8b8f", alpha = 0.35,
    size = mil_infrastructure |> filter(!NAME_1 %in% c("Moscow City", "Moskva")) |> pull(size_category),
    stroke = 0.1
  ) +
  geom_text_repel(
    data = russia_fed_district_outline,
    aes(x = x, y = y, label = toupper(NAME)),
    family = "Arial Narrow",
    size = 2.5,
    color = "black",
    box.padding = 0.5,
    point.padding = 0.5,
    bg.color = "white",
    bg.r = 0.15,
    fontface = "bold"
  ) +
  scale_fill_manual(
    values = c("#d3d3d3", "#a9a9a9", "#808080", "#5c5b5b", "#4d4d4d"),
    name = "Share within the dataset",
    guide = guide_legend(
      override.aes = list(size = 2, shape = 15, stroke = 0.1)
    )
  ) +
  labs(
    caption = "Crimea and Sevastopol are internationally recognized as part of Ukraine. Russia illegally occupied and annexed these territories in 2014.\nMilitary infrastructure in Moscow and the Moscow Oblast are displayed only on the inset map"
  ) +
  geom_point(
    data = legend_data,
    aes(x = x, y = y, size = size_category),
    shape = NA, fill = NA, stroke = NA
  ) +
  scale_size_manual(
    values = c(0.2, 0.5, 1, 2, 3),
    labels = c("1", "2-5", "6-10", "11-20", ">20"),
    guide = guide_legend(
      title = "Aggregate number of military infrastructure\nper municipal division",
      override.aes = list(shape = 21, stroke = 0.1)
    )
  ) +
  theme_minimal() +
  theme(
    text = element_text(family = "Arial Narrow"),
    plot.title = element_text(size = 16, face = "bold", hjust = 1),
    plot.subtitle = element_text(size = 10),
    plot.caption = element_text(size = 12, hjust = 0.8, face = "italic"),
    legend.title = element_text(size = 14),
    legend.text = element_text(size = 10),
    legend.position = c("top"),
    legend.direction = "horizontal",
    legend.box = "vertical",
    legend.box.just = "left",
    legend.spacing.y = unit(0.05, "cm"),
    legend.byrow = TRUE,
    legend.key.size = unit(0.6, "cm"),
    legend.key.width = unit(0.6, "cm"),
    legend.key.height = unit(0.1, "cm"),
    axis.title = element_blank(),
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    panel.grid = element_blank()
  ) +
  annotation_scale(
    location = "bl", bar_cols = NULL, width_hint = 0.2, 
    line_width = 0.3, height = unit(0, "cm"), text_family = "Arial Narrow"
  ) -> figure_1_main


## Rotate inset layers

rotate_sf <- function(sf_object, angle) {
  radians <- angle * pi / 180
  rotation_matrix <- matrix(
    c(cos(radians), sin(radians), -sin(radians), cos(radians)),
    nrow = 2, ncol = 2
  )
  st_geometry(sf_object) <- st_geometry(sf_object) * rotation_matrix
  return(sf_object)
}


rotated_map <- rotate_sf(russia_outline, angle = -69)

mil_infrastructure_rotated <- rotate_sf(mil_infrastructure, angle = -69)

rotated_map_fd <- rotate_sf(russia_fed_district_outline, angle = -69) |> 
  mutate(
    centroid = st_centroid(geometry),
    x = st_coordinates(centroid)[, 1],
    y = st_coordinates(centroid)[, 2]
  ) |> 
  mutate(
    NAME = case_when(
      NAME %in% c("Far East", "Siberia") ~ NA_character_,
      T  ~ as.character(NAME)
    )
  )

figure_1_inset <- ggplot(data = rotated_map) +
  geom_sf(aes(fill = share_category), color = "#383535", linewidth = 0.2)+
  geom_sf(data = mil_infrastructure_rotated, shape = 21, fill = "#7d8b8f", alpha = 0.35, size = mil_infrastructure$size_category, stroke = 0.1) +
  geom_sf(data = rotated_map_fd, fill = NA, color = "black", linewidth = 0.2) +
  scale_fill_manual(
    values = c("#d3d3d3", "#a9a9a9", "#808080", "#5c5b5b", "#4d4d4d"),
    name = "Share within the dataset (natural breaks)"
  )+
  geom_text_repel(
    data = rotated_map_fd,
    aes(x = x, y = y, label = toupper(NAME)),
    family = "Arial Narrow",
    size = 2.5,
    color = "black",
    box.padding = 0.5,
    point.padding = 0.5,
    bg.color = "white",
    bg.r = 0.15,
    fontface = "bold"
  ) +
  coord_sf(
    xlim = c(-335307, 2796789), 
    ylim = c(-4700000, -2000000)
  ) +
  annotate(
    geom = "text",
    x = -0,
    y = -4400000,
    label = "European Russia and\nthe North Caucasus",
    family = "Arial Narrow",
    size = 4.5,
    color = "black",
    fontface = "bold",
    hjust = 0.5,
    vjust = 0.5
  ) +
  theme_void() +
  theme(
    legend.position = "none",
    text = element_text(family = "Arial Narrow"),
    plot.background = element_rect(fill = "white", color = "black", linewidth = 1)
  ) +
  annotation_scale(location = "bl", bar_cols = NULL, width_hint = 0.2, height = unit(0, "cm"), line_width = 0.3, text_family = "Arial Narrow")

# figure_1_inset

# Combine maps
figure_1 <- ggdraw() +
  draw_plot(figure_1_main, x = 0.1, y = 0, width = 0.97, height = 0.97) +
  draw_plot(figure_1_inset, x = 0.01, y = 0.55, width = 0.3, height = 0.5)


ggsave("figure_1.pdf", dpi = 300, width = 7015, height = 4960, units = "px")



######################
#### Make Figure 2
######################

ruaf_analysis_full |>
  mutate(
    federal_units = case_when(
      geography == "Г. Краснодар" ~ "Краснодарский край",
      geography == "Г. Красноярск" ~ "Красноярский край",
      geography == "Г. Волгоград" ~ "Волгоградская область",
      geography == "Г. Воронеж" ~ "Свердловская область",
      geography == "Г. Екатеринбург" ~ "Свердловская область",
      geography == "Г. Казань" ~ "Республика Татарстан",
      geography == "Г. Нижний Новгород" ~ "Нижегородская область",
      geography == "Г. Новосибирск" ~ "Новосибирская область",
      geography == "Г. Омск" ~ "Омская область",
      geography == "Г. Пермь" ~ "Пермский край",
      geography == "Г. Ростов-на-Дону" ~ "Ростовская область",
      geography == "Г. Самара" ~ "Самарская область",
      geography == "Г. Уфа" ~ "Республика Башкортостан",
      geography == "Г. Челябинск" ~ "Челябинская область",
      geography == "Коми-Пермяцкий АО" ~ "Республика Коми",
      T ~ as.character(geography)
    )
  ) |> 
  group_by(federal_units) |> 
  count() |> 
  ungroup() |> 
  mutate(
    proportion = (n / sum(n))*100
  ) -> prop_everything

ruaf_analysis_full |> 
  group_by(geography, federal_district) |> 
  count() -> federal_districts

ruaf_analysis_full |>
  mutate(
    federal_units = case_when(
      geography == "Г. Краснодар" ~ "Краснодарский край",
      geography == "Г. Красноярск" ~ "Красноярский край",
      geography == "Г. Волгоград" ~ "Волгоградская область",
      geography == "Г. Воронеж" ~ "Свердловская область",
      geography == "Г. Екатеринбург" ~ "Свердловская область",
      geography == "Г. Казань" ~ "Республика Татарстан",
      geography == "Г. Нижний Новгород" ~ "Нижегородская область",
      geography == "Г. Новосибирск" ~ "Новосибирская область",
      geography == "Г. Омск" ~ "Омская область",
      geography == "Г. Пермь" ~ "Пермский край",
      geography == "Г. Ростов-на-Дону" ~ "Ростовская область",
      geography == "Г. Самара" ~ "Самарская область",
      geography == "Г. Уфа" ~ "Республика Башкортостан",
      geography == "Г. Челябинск" ~ "Челябинская область",
      geography == "Коми-Пермяцкий АО" ~ "Республика Коми",
      T ~ as.character(geography)
    )
  ) |> 
  group_by(federal_units) |> 
  count() |> 
  ungroup() |>
  filter(!federal_units %in% c("Заграница", "Федеральная миграционная служба", "Чечено-Ингушская АССР"), !is.na(federal_units)) |> # Exclude outside Russia and Checheno-Ingushetia (no corresponding region in current admin. division)
  mutate(
    proportion = (n / sum(n))*100
  ) |> 
  left_join(
    federal_districts |> dplyr::select(geography, federal_district), by = c("federal_units" = "geography")
  ) |>
  mutate(
    federal_district = case_when(
      federal_district == "Northeast" ~ "Northwest",
      T ~ as.character(federal_district)
    )
  )-> prop_classified

# Read in the file that has compatibility keys for shapefiles
geocodes <- read_csv("ruaf_geography.csv")

prop_classified |> 
  left_join(geocodes, by = "federal_units") |> 
  mutate(
    proportion = (n/sum(n))*100
  ) -> ruaf_mapping_data

russia_outline |> 
  dplyr::select(-c(proportion, share_category)) |> 
  left_join(
    ruaf_mapping_data |> dplyr::select(ID_1, proportion), by = "ID_1"
  )-> ruaf_analysis_outline

## Here too, classify, examine and "prettyfy"

breaks <- classIntervals(ruaf_analysis_outline$proportion, n = 5, style = "jenks")

breaks$brks <- c(0, 0.05, 1, 3, 10, 21)


ruaf_analysis_outline$share_category <- cut(
  ruaf_analysis_outline$proportion,
  breaks = breaks$brks,
  labels = c("<= 0.5%", "(0.5% - 1%]", "(1% - 3]%", "(3% - 10]%", "> 10%"),
  include.lowest = TRUE
)

### Build main and inset maps

ggplot(data = ruaf_analysis_outline) +
  geom_sf(aes(fill = share_category), color = "#383535", linewidth = 0.2)+
  geom_sf(data = russia_fed_district_outline, fill = NA, color = "black", linewidth = 0.2) +
  scale_fill_manual(
    values = c("#d3d3d3", "#a9a9a9", "#808080", "#5c5b5b", "#4d4d4d"),
    name = "Share within the dataset",
    guide = guide_legend(
      override.aes = list(size = 2, shape = 15, stroke = 0.1)
    )
  ) +
  labs(
    caption = "Crimea and Sevastopol are internationally recognized as part of Ukraine. Russia illegally occupied and annexed these territories in 2014."
  ) +
  geom_text_repel(
    data = russia_fed_district_outline,
    aes(x = x, y = y, label = toupper(NAME)),
    family = "Arial Narrow",
    size = 3,
    color = "black",
    box.padding = 0.5,
    point.padding = 0.5,
    bg.color = "white",
    bg.r = 0.15,
    fontface = "bold"
  ) +
  theme_minimal() +
  theme(
    text = element_text(family = "Arial Narrow"),
    plot.title = element_text(size = 16, face = "bold", hjust = 1),
    plot.subtitle = element_text(size = 10),
    plot.caption = element_text(size = 12, hjust = 0.8, face = "italic"),
    legend.title = element_text(size = 14),
    legend.text = element_text(size = 10),
    legend.position = c("top"),
    legend.direction = "horizontal",
    legend.box = "vertical",
    legend.box.just = "left",
    legend.spacing.y = unit(0.05, "cm"),
    legend.byrow = TRUE,
    legend.key.size = unit(0.6, "cm"),
    legend.key.width = unit(0.6, "cm"),
    legend.key.height = unit(0.1, "cm"),
    axis.title = element_blank(),
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    panel.grid = element_blank()
  ) +
  annotation_scale(location = "bl", bar_cols = NULL, width_hint = 0.2, height = unit(0, "cm"), line_width = 0.3, text_family = "Arial Narrow") -> figure_2_main

rotated_map_2 <- rotate_sf(ruaf_analysis_outline, angle = -69)

figure_2_inset <- ggplot(data = rotated_map_2) +
  geom_sf(aes(fill = share_category), color = "#383535", linewidth = 0.2)+
  geom_sf(data = rotated_map_fd, fill = NA, color = "black", linewidth = 0.2) +
  scale_fill_manual(
    values = c("#d3d3d3", "#a9a9a9", "#808080", "#5c5b5b", "#4d4d4d"),
    name = "Share within the dataset (natural breaks)"
  )+
  geom_text_repel(
    data = rotated_map_fd,
    aes(x = x, y = y, label = toupper(NAME)),
    family = "Arial Narrow",
    size = 2.5,
    color = "black",
    box.padding = 0.5,
    point.padding = 0.5,
    bg.color = "white",
    bg.r = 0.15,
    fontface = "bold"
  ) +
  coord_sf(
    xlim = c(-335307, 2796789), 
    ylim = c(-4700000, -2000000)
  ) +
  annotate(
    geom = "text",
    x = -0,
    y = -4400000,
    label = "European Russia and\nthe North Caucasus",
    family = "Arial Narrow",
    size = 4.5,
    color = "black",
    fontface = "bold",
    hjust = 0.5,
    vjust = 0.5
  ) +
  theme_void() +
  theme(
    legend.position = "none",
    text = element_text(family = "Arial Narrow"),
    plot.background = element_rect(fill = "white", color = "black", linewidth = 1)
  ) +
  annotation_scale(location = "bl", bar_cols = NULL, width_hint = 0.2, height = unit(0, "cm"), line_width = 0.3, text_family = "Arial Narrow")


# Combine maps
figure_2 <- ggdraw() +
  draw_plot(figure_2_main, x = 0.1, y = 0, width = 0.97, height = 0.97) +
  draw_plot(figure_2_inset, x = 0.01, y = 0.55, width = 0.3, height = 0.5)

ggsave("figure_2.pdf", dpi = 300, width = 7015, height = 4960, units = "px")


######################
#### Make Figure 3 ###
######################

ruaf_analysis_full |> 
  filter(prob_95pct!= "Not classified", !is.na(rank_large_f)) |>
  group_by(rank_large_f, prob_95pct) |>
  count() |> 
  ungroup() |> 
  pivot_wider(names_from = rank_large_f, values_from = n) |> 
  mutate(
    total = rowSums(
      across(
        c(`Enlisted`, `Sergeants`, `Junior officers`, `Senior officers`, `General officers`), ~ .),
      na.rm = TRUE
    )
  ) |> 
  arrange(desc(total)) |>
  mutate(
    ethnos_group = case_when(
      prob_95pct %in% c("Eastern Slav") ~ "Eastern Slavs",
      prob_95pct %in% c("National minority") ~ "Others (Ethnic Soldiers)"
    )
  ) |> 
  group_by(
    ethnos_group
  ) |> 
  summarize(
    `Enlisted` = sum(`Enlisted`, na.rm = TRUE),
    `Sergeants` = sum(`Sergeants`, na.rm = TRUE),
    `Junior officers` = sum(`Junior officers`, na.rm = TRUE),
    `Senior officers` = sum(`Senior officers`, na.rm = TRUE),
    `General officers` = sum(`General officers`, na.rm = TRUE),
  ) |> 
  pivot_longer(
    cols = -ethnos_group,
    names_to = "rank",
    values_to = "count"
  ) |> 
  mutate(
    rank = fct_relevel(rank, c("Enlisted", "Sergeants", "Junior officers", "Senior officers", "General officers"))
  )-> figure_3_data

figure_3_data

arrow_labels <- tibble(
  ethnos_group = c("Eastern Slavs", "Others (Ethnic Soldiers)"),
  x = c(5.21, 5.21),
  y = c(0.12, 0.12),
  label = c("47", "3")
)


figure_3_data |>
  group_by(ethnos_group) |>
  mutate(
    prop  = count / sum(count),
    pop_pos = prop*0.5,
    pop_neg = prop*-0.5,
    count_label = case_when(
      rank == "General officers" ~ NA_real_,
      T ~ as.double(count)
    )
  ) |> 
  ggplot() +
  geom_col(aes(rank, pop_pos)) +
  geom_col(aes(rank, pop_neg)) +
  geom_text(
    aes(
      x = rank,
      y = 0,
      label = scales::comma(count_label),
    ),
    family = "Arial Narrow", size = 3,
    color = "white"
  ) +
  annotate(
    "segment",
    x = 5.2,
    y = 0.1,
    xend = 5,
    yend = 0,
    arrow = arrow(angle = 5, length = unit(0.2, "inches"), type = "closed"),
    linewidth = 0.2
  ) +
  geom_text(
    data = arrow_labels,
    aes(x = x, y = y, label = label),
    inherit.aes = FALSE,
    family = "Arial Narrow",
    size = 3
  ) +
  scale_fill_manual(values = c("ES" = "#a9a9a9", "NESE" = "#5c5b5b")) +
  facet_wrap(~ethnos_group) + 
  coord_flip() +
  labs(
    caption = "Using Memorial classifier"
  ) +
  theme_bw() +
  theme(
    legend.position = "none",
    plot.caption = element_text(family = "Arial Narrow", size = 7),
    axis.text = element_text(family = "Arial Narrow", size = 11),
    axis.title = element_blank(),
    axis.ticks.x = element_blank(),
    axis.text.x = element_blank(),
    strip.text = element_text(family = "Arial Narrow", size = 14),
    strip.placement = "outside",
    strip.background = element_blank()
  )
  

ggsave("figure_3.pdf", width = 8.5, height = 3, dpi = 600)



######################
#### Make Table 1 ####
######################


ruaf_analysis_full |> 
  filter(!is.na(rank_large_f)) |> 
  group_by(rank_large_f, ethnos) |>
  count() |> 
  ungroup() |> 
  pivot_wider(names_from = rank_large_f, values_from = n) |> 
  mutate(
    total = rowSums(
      across(
        c(`Enlisted`, `Sergeants`, `Junior officers`, `Senior officers`, `General officers`), ~ .),
        na.rm = TRUE
    )
  ) |> 
  arrange(desc(total)) |>
  mutate(
    ethnos = case_when(
      ethnos == "KabardinAdyghe" ~ "Kabardin/Adyghe",
      ethnos == "KarachayBalkar" ~ "Karachay/Balkar",
      ethnos == "INVALID" ~ "Not classified",
      T ~ as.character(ethnos)
    ),
    ethnos_group = case_when(
      ethnos %in% c("Russian", "Belarusian", "Ukrainian") ~ "ES",
      TRUE ~ "NESE"
    )
  ) |> 
  mutate(
    across(
      c(`Enlisted`, `Sergeants`, `Junior officers`, `Senior officers`, `General officers`), ~ scales::percent(. / total, accuracy = 0.1)
    ),
    across(
      c(`Enlisted`, `Sergeants`, `Junior officers`, `Senior officers`, `General officers`), ~ if_else(is.na(.), "...", .)
    )
  ) -> table_1_data

table_1_data |> 
  dplyr::select(-ethnos_group) |> 
  gt(rowname_col = "ethnos") |> 
  tab_spanner(
    label = "Ranks",
    columns = vars(`Enlisted`, `Sergeants`, `Junior officers`, `Senior officers`, `General officers`)
  ) |> 
  tab_row_group(
    label = "NESE",
    rows = ethnos %in% filter(table_1_data, ethnos_group == "NESE")$ethnos
  ) |> 
  tab_row_group(
    label = "ES",
    rows = ethnos %in% filter(table_1_data, ethnos_group == "ES")$ethnos
  ) |> 
  cols_label(
    `Enlisted` = "Enlisted",
    `Sergeants` = "Sergeants",
    `Junior officers` = "Junior Officers",
    `Senior officers` = "Senior Officers",
    `General officers` = "General Officers",
    total = "Total"
  ) -> table_1

table_1

table_1 |> 
  as_latex() |>
  writeLines(con = "table_1.tex")



######################
#### Make Table 2
######################

## Run Models. Note that this takes a considerable amount of time (around 20-30 minutes)

## Model 1 FE: Simple demographic model with Memorial nationalities, fixed effects on geography

mod_1_fe <- ordinal::clmm(rank_large_f~branch+sex + Prob_non_es_memo + (1|geography), data = ruaf_analysis_full)

## Model 1: Simple demographic model with Memorial nationalities

model_1 <- MASS::polr(rank_large_f~branch+sex+Prob_non_es_memo, data = ruaf_analysis_full)

## Model 2: Simple demographic model with Bessudnov et al nationalities

model_2  <- MASS::polr(rank_large_f~branch+sex+birth_cohorts+Prob_non_es_bess, data = ruaf_analysis_full)

## Model 2: Simple demographic model with Bessudnov et al nationalities, geography FEs

model_2_fe  <- ordinal::clmm(rank_large_f~branch+sex+birth_cohorts+Prob_non_es_bess + (1|geography), data = ruaf_analysis_full)

## Model 3: Demographic + birth cohorts + Income/large city + Ethnic/demographic + nat/republic controls, with Memorial nationalities

model_3 <- MASS::polr(rank_large_f~branch+sex+birth_cohorts+Prob_non_es_memo+category+quintile_group + millionaires + national_republics, data = ruaf_analysis_full)

## Model 4: Demographic + Income/large city + Ethnic/demographic + nat/republic controls, with Memorial nationalities X birth cohorts interaction

model_4 <- MASS::polr(rank_large_f~branch+sex+birth_cohorts*Prob_non_es_memo+category+quintile_group + millionaires + national_republics, data = ruaf_analysis_full)


## Impute data for model robustness check where nationality is missing in the Memorial data

ruaf_analysis_full |> 
  mutate(
    all_es = case_when(
      is.na(Prob_non_es_memo) ~ 0,
      T ~ as.double(Prob_non_es_memo)
    ),
    all_nese = case_when(
      is.na(Prob_non_es_memo) ~ 1,
      T ~ as.double(Prob_non_es_memo)
    )
  ) -> ruaf_imp

## Model 5: Same as Model 4, but imputes all unclassified records as NESE

model_5 <- MASS::polr(rank_large_f~branch+sex+birth_cohorts*all_nese+category+quintile_group + millionaires + national_republics, data = ruaf_imp)

## Model 6: Same as Model 4, but imputes all unclassified records as ES

model_6 <- MASS::polr(rank_large_f~branch+sex+birth_cohorts*all_es+category+quintile_group + millionaires + national_republics, data = ruaf_imp)

## Model 7: Full model with all controls, with categorical nationalities based on the Bessudnov et al approach.

model_7 <- MASS::polr(rank_large_f~branch+agr_ethnos_f+sex+birth_cohorts+category+quintile_group + millionaires + national_republics, data = ruaf_analysis_full)



## Assemble table data
row_names <- c(
  "NESE (Memorial) ", "NESE (B et al) ", "Gender Controls ", "Branch Controls ", "Age Cohort Controls ", "Place of Registration: Income/ Large City Controls", "Place of Registration: Ethnic Demography Controls", "Place of Registration: National Republic Controls", "NESE*[b.1975-1980] ", "NESE*[b.1981-1984] ", "NESE*[b.1985-1990] ", "NESE*[b. 1991-1994] ", "NESE*[b. after 1995] ", "Recoding Rule: All unclassified = NESE", "Recoding Rule: All unclassified = ES",  "Tatar/Bashkir", "Jews", "Chechen/Ingush/Daghestani", "Tajik/Uzbek", "Buryat", "Yakut", "McFadden's Pseudo R^2", "N"
)


## Calculate McFadden's Pseudo R^2

model_1_psr2 <- pR2(model_1)
model_2_psr2 <- pR2(model_2)
model_3_psr2 <- pR2(model_3)
model_4_psr2 <- pR2(model_4)
model_5_psr2 <- pR2(model_5)
model_6_psr2 <- pR2(model_6)
model_7_psr2 <- pR2(model_7)


## McFadden's Pseudo R^2 for fixed effects models

# Null model

null_mod <- clmm(rank_large_f ~ 1 + (1 | geography), data = ruaf_analysis_full)
logLik_null <- logLik(null_mod)

# Model 1
logLik_full_mod1_fe <- logLik(mod_1_fe)

# Model 2
logLik_full_mod2_fe <- logLik(model_2_fe)

model_1_fe_r2 <- 1 - as.numeric(logLik_full_mod1_fe / logLik_null)
model_2_fe_r2 <- 1 - as.numeric(logLik_full_mod2_fe / logLik_null)


## Table

table_2_data <- tibble(
  row_name = row_names,
  model_1 = c(
    round(mod_1_fe$coefficients["Prob_non_es_memo"], 2) |> unname(),
    "", "Y", "Y", rep("", 17), model_1_fe_r2|> round(2) |> unname(), mod_1_fe$info$nobs
  ),
  model_2 = c(
    "",
    round(model_2_fe$coefficients["Prob_non_es_bess"], 2) |> unname(),
    "Y", "Y", rep("", 17), model_2_fe_r2|> round(2) |> unname(), model_2_fe$info$nobs
  ),
  model_3 = c(
    round(model_3$coefficients["Prob_non_es_memo"], 2) |> unname(),
    "", "Y", "Y", "Y", "Y", "Y", "Y", rep("", 13), round(model_3_psr2[4], 2) |> unname(), model_3$n
  ),
  model_4 = c(
    paste0(round(model_4$coefficients["Prob_non_es_memo"], 2) |> unname(), "~"),
    "", "Y", "Y", "", "Y", "Y", "Y",
    model_4$coefficients["birth_cohorts1975-1980:Prob_non_es_memo"] |> round(2) |> unname(),
    model_4$coefficients["birth_cohorts1981-1984:Prob_non_es_memo"] |> round(2) |> unname(),
    model_4$coefficients["birth_cohorts1985-1990:Prob_non_es_memo"] |> round(2) |> unname(),
    model_4$coefficients["birth_cohorts1991-1994:Prob_non_es_memo"] |> round(2) |> unname(),
    model_4$coefficients["birth_cohorts1995 or later:Prob_non_es_memo"] |> round(2) |> unname(), rep("", 8),
    round(model_4_psr2[4], 2) |> unname(), model_4$n
  ),
  model_5 = c(
    "",
    "", "Y", "Y", "", "Y", "Y", "Y",
    model_5$coefficients["birth_cohorts1975-1980:all_nese"] |> round(2) |> unname(),
    model_5$coefficients["birth_cohorts1981-1984:all_nese"] |> round(2) |> unname(),
    paste0(model_5$coefficients["birth_cohorts1985-1990:all_nese"] |> round(2) |> unname(),"~"),
    paste0(model_5$coefficients["birth_cohorts1991-1994:all_nese"] |> round(2) |> unname(),"~"),
    model_5$coefficients["birth_cohorts1995 or later:all_nese"] |> round(2) |> unname(), "Y", rep("", 7), round(model_5_psr2[4], 2) |> unname(), model_5$n
  ),
  model_6 = c(
    "",
    "", "Y", "Y", "", "Y", "Y", "Y",
    model_6$coefficients["birth_cohorts1975-1980:all_es"] |> round(2) |> unname(),
    model_6$coefficients["birth_cohorts1981-1984:all_es"] |> round(2) |> unname(),
    paste0(model_6$coefficients["birth_cohorts1985-1990:all_es"] |> round(2) |> unname(), "~"),
    paste0(model_6$coefficients["birth_cohorts1991-1994:all_es"] |> round(2) |> unname(), "~"),
    paste0(model_6$coefficients["birth_cohorts1995 or later:all_es"] |> round(2) |> unname(), "~"), "", "Y", rep("", 6), round(model_6_psr2[4], 2) |> unname(), model_6$n
  ),
  model_7 = c(
    "",
    "", "Y", "Y", "Y", "Y", "Y", "Y", rep("", 7),
    model_7$coefficients["agr_ethnos_fBashTat"] |> round(2) |> unname(),
    model_7$coefficients["agr_ethnos_fJewish"] |> round(2) |> unname(),
    model_7$coefficients["agr_ethnos_fCheDagIng"] |> round(2) |> unname(),
    model_7$coefficients["agr_ethnos_fTajUzb"] |> round(2) |> unname(),
    model_7$coefficients["agr_ethnos_fBuryat"] |> round(2) |> unname(),
    model_7$coefficients["agr_ethnos_fYakut"] |> round(2) |> unname(),
    round(model_7_psr2[4], 2) |> unname(), model_7$n
  )
)

table_2_data |> 
  gt(rowname_col = "row_name") |> 
  cols_label(
    model_1 = "Model 1",
    model_2 = "Model 2",
    model_3 = "Model 3",
    model_4 = "Model 4",
    model_5 = "Model 5",
    model_6 = "Model 6",
    model_7 = "Model 7"
  ) |> 
  tab_spanner(
    label = "",
    columns = c(model_1, model_2, model_3, model_4, model_5, model_6, model_7)
  ) |> 
  tab_style(
    style = cell_text(style = "italic", weight = "bold"),
    locations = cells_body(
      columns = row_name,
      rows = grepl("McFadden's Pseudo R^2", row_name)
    )
  ) |> 
  tab_style(
    style = cell_text(style = "italic"),
    locations = cells_body(
      columns = row_name,
      rows = grepl("^N$", row_name)
    )
  ) |> 
  fmt_markdown(
    columns = row_name,
    rows = grepl("Pseudo R2", row_name)
  ) -> table_2

table_2_latex <- table_2_data |> 
  gt(rowname_col = "row_name") |> 
  cols_label(
    model_1 = "Model 1",
    model_2 = "Model 2",
    model_3 = "Model 3",
    model_4 = "Model 4",
    model_5 = "Model 5",
    model_6 = "Model 6",
    model_7 = "Model 7"
  ) |> 
  tab_spanner(
    label = "",
    columns = c(model_1, model_2, model_3, model_4, model_5, model_6, model_7)
  ) |> 
  tab_style(
    style = cell_text(style = "italic", weight = "bold"),
    locations = cells_body(
      columns = row_name,
      rows = grepl("McFadden's Pseudo R^2", row_name)
    )
  ) |> 
  tab_style(
    style = cell_text(style = "italic"),
    locations = cells_body(
      columns = row_name,
      rows = grepl("^N$", row_name)
    )
  ) |> 
  fmt_markdown(
    columns = row_name,
    rows = grepl("Pseudo R2", row_name)
  ) |> 
  as_latex()

# Add a custom line at the bottom

custom_line <- "\\multicolumn{8}{l}{\\scriptsize All coefficients statistically significant at p.05 or lower unless there is a $\\sim$. NESE refers to 'Non-Eastern Slavic Ethnicities' -- see FN20.} \\\\"

# Insert that custom line after bottomrule
table_2_latex_write <- str_replace(
  table_2_latex,
  regex("(\\\\bottomrule\\s*\\n)"),
  function(match) paste0(match, custom_line, "\n")
)

# Print the modified LaTeX code
writeLines(table_2_latex_write, "table_2.tex")



######################
#### Make Figure 4 ###
######################

# Calculate predicted probabilities for Model 3

marginaleffects::predictions(model_3, datagrid(
  Prob_non_es_memo = c(0.05, 0.95),
  birth_cohorts = "1985-1990",
  category = "Non-Eastern slavs constitute a small minority",
  branch = "Army",
  sex = "Male"
)) |> 
  mutate(
    nationality = factor(Prob_non_es_memo, levels = c(0.05, 0.95), labels = c("ES", "NESE")),
  )  -> model_3_pred_probs


model_3_pred_probs |>
  filter(group != "General officers") |> 
  ggplot(aes(x = nationality, y = estimate, fill = nationality)) +
  geom_col() +
  coord_flip(ylim = c(0, 1)) +
  geom_linerange(aes(ymin = conf.low, ymax = conf.high), linewidth = 0.5) + 
  geom_text(
    aes(
      label = round(estimate, 3)
    ),
    family = "Arial Narrow", size = 5, nudge_y = 0.05
  ) + 
  scale_fill_manual(values = c("ES" = "#a9a9a9", "NESE" = "#5c5b5b")) +
  facet_grid(rows = vars(group), switch = "y", scales = "free_y") +
  labs(
    y = "Probability of having a rank (95% confidence intervals in brackets)",
    caption = "Probabilities based on Model 3 (all controls); Branch is kept at 'Army,' sex at 'Male,' birth cohorts at '1985-1990', regional control at NESE constitute a small minority. Other controls are kept at mean (0)"
  ) +
  theme_bw() +
  theme(
    legend.position = "none",
    plot.caption = element_text(family = "Arial Narrow", size = 8),
    axis.text = element_text(family = "Arial Narrow", size = 12),
    axis.title = element_text(family = "Arial Narrow", size = 14),
    axis.title.y = element_blank(),
    strip.text = element_text(family = "Arial Narrow", size = 16),
    strip.placement = "outside",
    strip.background = element_blank()
  )


ggsave("figure_4.pdf", width = 8.5, height = 5.5, dpi = 600)



######################
#### Make Figure 5 ###
######################


ethnic_groups <- data.frame(
  group = c(
    "Russian\nState\nApparatus", 
    "Eastern Slavic", 
    "Bashkir or Tatar\nJewish", 
    "Chechen, Ingush, or Dagestani\nUzbek or Tajik", 
    "Buryat\nYakut"
  ),
  x = c(0, 0, 0, 0, 0),
  y = c(0, -1.35, -2.5, -3.5, -4.5),
  radius = c(1, 2, 3, 4, 5),
  color = c("white", "black", "black", "black", "black")
)

ggplot() +
  geom_circle(aes(x0 = 0, y0 = 0, r = 5), color = "black", fill = "grey", alpha = 1) +
  geom_circle(aes(x0 = 0, y0 = 0, r = 4), color = "black", fill = "white", alpha = 1) +
  geom_circle(aes(x0 = 0, y0 = 0, r = 3), color = "black", fill = "grey", alpha = 1) +
  geom_circle(aes(x0 = 0, y0 = 0, r = 2), color = "black", fill = "white", alpha = 1) +
  geom_circle(aes(x0 = 0, y0 = 0, r = 1), color = "black", fill = "black", alpha = 1) +
  
  
  geom_text(
    data = ethnic_groups, aes(x = x, y = y, label = group),
    size = 3.5, color = ethnic_groups$color, hjust = 0.5, fontface = "bold", family = "Arial Narrow"
  ) +
  
  annotate("text", family = "Arial Narrow", x = 0, y = 4.5, label = "Least necessary to state maintenance\nand least politically reliable", size = 4, hjust = 0.5) +
  annotate("text", family = "Arial Narrow", x = 0, y = 3.5, label = "Useful to state but of questionable reliability", size = 4, hjust = 0.5) +
  annotate("text", family = "Arial Narrow", x = 0, y = 2.5, label = "Very valuable to state though\nreliability not total", size = 4, hjust = 0.5) +
  annotate("text", family = "Arial Narrow", x = 0, y = 1.3, label = "Most reliable and most\npolitically resourceful", size = 4, hjust = 0.5) +
  
  coord_fixed() +
  xlim(-5, 5) + ylim(-5, 5)+
  theme_void()+
  theme(
    legend.position = "none",
    text = element_text(family = "Arial Narrow"),
    plot.caption = element_text(family = "Arial Narrow", size = 12)
  )+
  labs(
    caption = "Adapted from Enloe (1980, 24)"
  )

ggsave("figure_5.pdf", width = 10, height = 9, dpi = 600)

